In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
In [2]:
# Configure how pandas renders frames: show every column, cap printed rows
# at 100, and format floats with two decimal places.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

DATA UNDERSTANDING AND CLEANING PROCESS FOR VIRAL SOCIAL MEDIA TRENDS DATASET¶

In [4]:
# Read the viral-trends CSV (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='Viral_Social_Media_Trends.csv')
In [ ]:
 
In [9]:
# Basic information about the dataset
print("Dataset Information:")
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()
Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Post_ID           5000 non-null   object
 1   Platform          5000 non-null   object
 2   Hashtag           5000 non-null   object
 3   Content_Type      5000 non-null   object
 4   Region            5000 non-null   object
 5   Views             5000 non-null   int64 
 6   Likes             5000 non-null   int64 
 7   Shares            5000 non-null   int64 
 8   Comments          5000 non-null   int64 
 9   Engagement_Level  5000 non-null   object
dtypes: int64(4), object(6)
memory usage: 390.8+ KB
None
In [ ]:
 
In [12]:
# Descriptive statistics for the numeric columns, printed under a heading.
print("\nStatistical Summary:", df.describe(), sep="\n")
Statistical Summary:
           Views     Likes   Shares  Comments
count    5000.00   5000.00  5000.00   5000.00
mean  2494066.44 251475.03 50519.56  24888.39
std   1459489.82 144349.58 29066.36  14284.50
min      1266.00    490.00    52.00     18.00
25%   1186207.25 126892.25 25029.00  12305.25
50%   2497373.00 249443.00 50839.50  25004.00
75%   3759781.00 373970.75 75774.25  37072.75
max   4999430.00 499922.00 99978.00  49993.00
In [ ]:
 
In [15]:
# Per-column tally of null entries.
print("\nMissing values in each column:", df.isna().sum(), sep="\n")
Missing values in each column:
Post_ID             0
Platform            0
Hashtag             0
Content_Type        0
Region              0
Views               0
Likes               0
Shares              0
Comments            0
Engagement_Level    0
dtype: int64
In [17]:
# Count rows that repeat an earlier row in full (the first occurrence is not counted).
print(f"\nNumber of duplicate rows: {df.duplicated(keep='first').sum()}")
Number of duplicate rows: 0
In [19]:
# Impute gaps in the key columns; each branch is skipped when nothing is missing.
# Categorical columns take their most frequent value.
for categorical_col in ('Content_Type', 'Platform'):
    if df[categorical_col].isna().any():
        modal_value = df[categorical_col].mode()[0]
        df[categorical_col] = df[categorical_col].fillna(modal_value)
        print(f"Filled missing {categorical_col} values with: {modal_value}")

# Numeric engagement counts take the column median.
engagement_metrics = ['Views', 'Likes', 'Shares', 'Comments']
for metric in engagement_metrics:
    if not df[metric].isna().any():
        continue
    median_value = df[metric].median()
    df[metric] = df[metric].fillna(median_value)
    print(f"Filled missing {metric} values with median: {median_value}")
In [21]:
# Drop repeated rows, keeping the first occurrence, and report the new shape.
# Nothing is changed or printed when the frame has no duplicates.
if df.duplicated().any():
    df = df.drop_duplicates(keep='first')
    print(f"Removed duplicate rows. New shape: {df.shape}")
In [23]:
# Normalise the casing of every categorical text column to lowercase so that
# labels differing only by case collapse into one category.
text_columns = ['Platform', 'Hashtag', 'Content_Type', 'Region', 'Engagement_Level']
df[text_columns] = df[text_columns].apply(lambda column: column.str.lower())
In [25]:
# Standardize hashtags (remove # symbol if present)
# regex=False treats '#' as a plain literal explicitly, so the result does not
# depend on the installed pandas version's default for `regex`.
df['Hashtag'] = df['Hashtag'].str.replace('#', '', regex=False)
In [27]:
# Derived metric: total interactions per post = likes + shares + comments.
df['Total_Engagement'] = df['Likes'].add(df['Shares']).add(df['Comments'])
In [29]:
# Derived metric: interactions per view, expressed as a percentage.
df['Engagement_Rate'] = df['Total_Engagement'].div(df['Views']).mul(100)
In [31]:
# Columns screened for outliers with the IQR rule, plus an (initially empty)
# mapping of column name -> outlier count that the screening loop fills in.
numeric_columns = [
    'Views',
    'Likes',
    'Shares',
    'Comments',
    'Total_Engagement',
    'Engagement_Rate',
]
outliers_summary = dict()
In [33]:
# Flag values lying more than 1.5 * IQR below the first quartile or above the
# third quartile, record the count per column, and print a short report.
for col in numeric_columns:
    values = df[col]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    is_outlier = (values < lower_bound) | (values > upper_bound)
    n_outliers = int(is_outlier.sum())
    outliers_summary[col] = n_outliers

    print(f"\nColumn: {col}")
    print(f"Number of outliers: {n_outliers}")
    print(f"Percentage of outliers: {100 * n_outliers / len(df):.2f}%")
    print(f"Range: [{lower_bound:.2f}, {upper_bound:.2f}]")
Column: Views
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-2674153.38, 7620141.62]

Column: Likes
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-243725.50, 744588.50]

Column: Shares
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-51088.88, 151892.12]

Column: Comments
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-24846.00, 74224.00]

Column: Total_Engagement
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-170381.62, 822687.38]

Column: Engagement_Rate
Number of outliers: 595
Percentage of outliers: 11.90%
Range: [-21.13, 55.88]
In [35]:
# Final dataset summary
print("\nFinal dataset shape:", df.shape)
print("\nFinal dataset information:")
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()
Final dataset shape: (5000, 12)

Final dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Post_ID           5000 non-null   object 
 1   Platform          5000 non-null   object 
 2   Hashtag           5000 non-null   object 
 3   Content_Type      5000 non-null   object 
 4   Region            5000 non-null   object 
 5   Views             5000 non-null   int64  
 6   Likes             5000 non-null   int64  
 7   Shares            5000 non-null   int64  
 8   Comments          5000 non-null   int64  
 9   Engagement_Level  5000 non-null   object 
 10  Total_Engagement  5000 non-null   int64  
 11  Engagement_Rate   5000 non-null   float64
dtypes: float64(1), int64(5), object(6)
memory usage: 468.9+ KB
None
In [37]:
# Post-cleaning check: per-column count of nulls still present.
print("\nRemaining missing values:", df.isna().sum(), sep="\n")
Remaining missing values:
Post_ID             0
Platform            0
Hashtag             0
Content_Type        0
Region              0
Views               0
Likes               0
Shares              0
Comments            0
Engagement_Level    0
Total_Engagement    0
Engagement_Rate     0
dtype: int64

EXPLORATORY DATA ANALYSIS¶

In [40]:
# Section banner: title framed by two 50-character rules.
print("\n" + "=" * 50, "EXPLORATORY DATA ANALYSIS", "=" * 50, sep="\n")
==================================================
EXPLORATORY DATA ANALYSIS
==================================================
In [42]:
# Section 1 heading: structure and summary statistics.
print("\n1. DATASET STRUCTURE AND SUMMARY", "-" * 30, sep="\n")
1. DATASET STRUCTURE AND SUMMARY
------------------------------
In [44]:
# Report the frame's size as rows x columns.
rows = len(df)
cols = len(df.columns)
print(f"Dataset dimensions: {rows} rows x {cols} columns")
Dataset dimensions: 5000 rows x 12 columns
In [46]:
# Dtype of each column, printed under a heading.
print("\nData types:", df.dtypes, sep="\n")
Data types:
Post_ID              object
Platform             object
Hashtag              object
Content_Type         object
Region               object
Views                 int64
Likes                 int64
Shares                int64
Comments              int64
Engagement_Level     object
Total_Engagement      int64
Engagement_Rate     float64
dtype: object
In [48]:
# Descriptive statistics again, now including the two derived columns.
print("\nSummary statistics of numeric columns:", df.describe(), sep="\n")
Summary statistics of numeric columns:
           Views     Likes   Shares  Comments  Total_Engagement  \
count    5000.00   5000.00  5000.00   5000.00           5000.00   
mean  2494066.44 251475.03 50519.56  24888.39         326882.99   
std   1459489.82 144349.58 29066.36  14284.50         148089.98   
min      1266.00    490.00    52.00     18.00          12418.00   
25%   1186207.25 126892.25 25029.00  12305.25         202019.25   
50%   2497373.00 249443.00 50839.50  25004.00         327743.50   
75%   3759781.00 373970.75 75774.25  37072.75         450286.50   
max   4999430.00 499922.00 99978.00  49993.00         635139.00   

       Engagement_Rate  
count          5000.00  
mean             56.76  
std             486.21  
min               0.30  
25%               7.75  
50%              12.91  
75%              27.00  
max           28174.17  
In [50]:
# Section 2 heading: value counts of the categorical columns.
print("\n2. CATEGORICAL DISTRIBUTIONS", "-" * 30, sep="\n")
2. CATEGORICAL DISTRIBUTIONS
------------------------------
In [52]:
# Posts per platform (most frequent first) and the number of distinct platforms.
platform_counts = df['Platform'].value_counts()
print(f"\nNumber of unique platforms: {df['Platform'].nunique()}")
print("Platform distribution:", platform_counts, sep="\n")
Number of unique platforms: 4
Platform distribution:
Platform
youtube      1324
tiktok       1260
instagram    1212
twitter      1204
Name: count, dtype: int64
In [54]:
# Bar chart of post counts per platform; written to PNG first, then displayed.
fig, ax = plt.subplots(figsize=(10, 6))
platform_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Social Media Platforms')
ax.set_xlabel('Platform')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('platform_distribution.png')
plt.show()
plt.close(fig)
No description has been provided for this image
In [ ]:
 
In [57]:
# The ten most frequent hashtags and the number of distinct hashtags overall.
hashtag_counts = df['Hashtag'].value_counts().iloc[:10]
print(f"\nNumber of unique hashtags: {df['Hashtag'].nunique()}")
print("Top 10 hashtags:", hashtag_counts, sep="\n")
Number of unique hashtags: 10
Top 10 hashtags:
Hashtag
fitness      536
education    525
challenge    507
comedy       505
dance        496
music        493
tech         491
fashion      487
viral        481
gaming       479
Name: count, dtype: int64
In [59]:
# Bar chart of the ten most frequent hashtags; written to PNG first, then displayed.
fig, ax = plt.subplots(figsize=(12, 6))
hashtag_counts.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Hashtags')
ax.set_xlabel('Hashtag')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('top_hashtags.png')
plt.show()
plt.close(fig)
No description has been provided for this image
In [61]:
# Posts per content type (most frequent first) and the number of distinct types.
content_counts = df['Content_Type'].value_counts()
print(f"\nNumber of unique content types: {df['Content_Type'].nunique()}")
print("Content type distribution:", content_counts, sep="\n")
Number of unique content types: 6
Content type distribution:
Content_Type
live stream    855
post           853
reel           841
tweet          836
video          828
shorts         787
Name: count, dtype: int64
In [63]:
# Bar chart of post counts per content type; written to PNG first, then displayed.
fig, ax = plt.subplots(figsize=(10, 6))
content_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Content Types')
ax.set_xlabel('Content Type')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('content_type_distribution.png')
plt.show()
plt.close(fig)
No description has been provided for this image
In [ ]:
 
In [66]:
# Section 3 heading: distributions of the engagement metrics.
print("\n3. ENGAGEMENT METRICS DISTRIBUTIONS", "-" * 30, sep="\n")
3. ENGAGEMENT METRICS DISTRIBUTIONS
------------------------------
In [68]:
# 3x2 grid of histograms with KDE overlay, one panel per entry in numeric_columns.
fig = plt.figure(figsize=(20, 15))

for position, metric in enumerate(numeric_columns, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.histplot(df[metric], kde=True, ax=ax)
    ax.set(title=f'Distribution of {metric}', xlabel=metric, ylabel='Frequency')

fig.tight_layout()
fig.savefig('engagement_distributions.png')
plt.show()
plt.close(fig)
No description has been provided for this image
In [69]:
# Log-transformed distributions for better visualization
# Social media metrics often follow power law distributions
plt.figure(figsize=(20, 15))

for i, metric in enumerate(['Views', 'Likes', 'Shares', 'Comments', 'Total_Engagement'], 1):
    plt.subplot(3, 2, i)
    # log1p computes log(1 + x), so a zero count would not produce log(0)
    sns.histplot(np.log1p(df[metric]), kde=True)
    plt.title(f'Log Distribution of {metric}')
    plt.xlabel(f'Log({metric})')
    plt.ylabel('Frequency')

plt.tight_layout()
# Save BEFORE show: in a notebook plt.show() finalises and releases the current
# figure, so a later plt.savefig() would write a new, empty canvas to disk.
plt.savefig('log_engagement_distributions.png')
plt.show()
plt.close()
No description has been provided for this image
In [ ]:
 
In [71]:
# Section 4 heading: which platforms / content types engage most.
print("\n4. MOST ENGAGING CONTENT ANALYSIS", "-" * 30, sep="\n")
4. MOST ENGAGING CONTENT ANALYSIS
------------------------------
In [72]:
# Mean of every numeric column per platform, ordered by mean engagement rate
# (highest first).
platform_engagement = (
    df.groupby('Platform')[numeric_columns]
    .mean()
    .sort_values(by='Engagement_Rate', ascending=False)
)
print("\nAverage engagement metrics by platform:", platform_engagement, sep="\n")
Average engagement metrics by platform:
               Views     Likes   Shares  Comments  Total_Engagement  \
Platform                                                              
instagram 2404079.88 257118.22 50310.91  24958.11         332387.24   
tiktok    2515015.40 244206.72 51468.26  24778.70         320453.67   
youtube   2545648.40 258314.00 50073.09  25321.39         333708.48   
twitter   2506004.59 245880.12 50227.75  24456.86         320564.73   

           Engagement_Rate  
Platform                    
instagram            66.21  
tiktok               63.87  
youtube              50.64  
twitter              46.54  
In [73]:
# Visualize platform engagement: mean engagement rate per platform as a bar chart
plt.figure(figsize=(12, 6))
platform_engagement['Engagement_Rate'].plot(kind='bar')
plt.title('Average Engagement Rate by Platform')
plt.xlabel('Platform')
plt.ylabel('Engagement Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
# Save BEFORE show: in a notebook plt.show() finalises and releases the current
# figure, so a later plt.savefig() would write a new, empty canvas to disk.
plt.savefig('platform_engagement.png')
plt.show()
plt.close()
No description has been provided for this image
In [74]:
# Mean of every numeric column per content type, ordered by mean engagement
# rate (highest first).
content_engagement = (
    df.groupby('Content_Type')[numeric_columns]
    .mean()
    .sort_values(by='Engagement_Rate', ascending=False)
)
print("\nAverage engagement metrics by content type:", content_engagement, sep="\n")
Average engagement metrics by content type:
                  Views     Likes   Shares  Comments  Total_Engagement  \
Content_Type                                                             
shorts       2486350.72 247926.20 49593.95  25060.41         322580.57   
live stream  2481009.01 248811.62 49921.96  25167.64         323901.21   
tweet        2500386.93 249384.47 51427.45  24635.99         325447.91   
post         2474337.32 252784.17 50164.20  24554.97         327503.34   
video        2489008.86 254768.92 50614.39  24694.61         330077.93   
reel         2533268.68 255011.07 51357.87  25223.41         331592.34   

              Engagement_Rate  
Content_Type                   
shorts                  80.63  
live stream             61.88  
tweet                   52.07  
post                    49.81  
video                   48.82  
reel                    48.75  
In [75]:
# Visualize content type engagement: mean engagement rate per content type as a bar chart
plt.figure(figsize=(12, 6))
content_engagement['Engagement_Rate'].plot(kind='bar')
plt.title('Average Engagement Rate by Content Type')
plt.xlabel('Content Type')
plt.ylabel('Engagement Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
# Save BEFORE show: in a notebook plt.show() finalises and releases the current
# figure, so a later plt.savefig() would write a new, empty canvas to disk.
plt.savefig('content_engagement.png')
plt.show()
plt.close()
No description has been provided for this image
In [76]:
# Mean of every numeric column per hashtag; keep the ten hashtags with the
# highest mean engagement rate.
hashtag_engagement = (
    df.groupby('Hashtag')[numeric_columns]
    .mean()
    .sort_values(by='Engagement_Rate', ascending=False)
    .head(10)
)
print("\nAverage engagement metrics by top hashtags:", hashtag_engagement, sep="\n")
Average engagement metrics by top hashtags:
               Views     Likes   Shares  Comments  Total_Engagement  \
Hashtag                                                               
viral     2437590.28 249964.49 50428.89  24776.51         325169.90   
comedy    2450141.71 253615.75 49418.05  24798.89         327832.68   
gaming    2500698.95 242819.59 51360.82  25332.24         319512.65   
fitness   2599391.00 253130.12 51183.12  24532.69         328845.94   
tech      2516381.46 260347.15 48287.21  25799.55         334433.91   
education 2531227.84 260423.42 51748.70  24684.92         336857.05   
fashion   2426830.62 249970.58 50956.94  24206.32         325133.85   
dance     2447362.77 254484.89 49557.96  24565.04         328607.89   
challenge 2451335.16 241212.44 51182.43  25207.85         317602.72   
music     2568758.78 248012.82 50965.63  25024.50         324002.94   

           Engagement_Rate  
Hashtag                     
viral               100.47  
comedy               66.96  
gaming               62.07  
fitness              54.41  
tech                 53.61  
education            50.87  
fashion              48.84  
dance                46.66  
challenge            45.96  
music                39.55  
In [78]:
# Visualize hashtag engagement: mean engagement rate per hashtag as a bar chart
plt.figure(figsize=(12, 6))
hashtag_engagement['Engagement_Rate'].plot(kind='bar')
plt.title('Average Engagement Rate by Top Hashtags')
plt.xlabel('Hashtag')
plt.ylabel('Engagement Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
# Save BEFORE show: in a notebook plt.show() finalises and releases the current
# figure, so a later plt.savefig() would write a new, empty canvas to disk.
plt.savefig('hashtag_engagement.png')
plt.show()
plt.close()
No description has been provided for this image
In [132]:
# Average likes, shares and comments per platform, drawn as grouped bars.
# Group the cleaned `df` built by the cells above. Re-reading the CSV into `df`
# here would throw away the lowercased labels and the derived Total_Engagement /
# Engagement_Rate columns that the later cells in this notebook read from `df`.
df_avg = df.groupby('Platform')[['Likes', 'Shares', 'Comments']].mean().reset_index()

# Prepare data
labels = df_avg['Platform']
likes = df_avg['Likes']
shares = df_avg['Shares']
comments = df_avg['Comments']
x = np.arange(len(labels))  # one slot per platform on the x axis
width = 0.25                # bar width; three bars sit side by side per slot

# Use distinct colors for each metric
color_likes = '#1f77b4'     # blue
color_shares = '#2ca02c'    # green
color_comments = '#d62728'  # red

# Plot: likes left of the slot centre, shares on it, comments right of it
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width, likes, width, label='Likes', color=color_likes)
ax.bar(x, shares, width, label='Shares', color=color_shares)
ax.bar(x + width, comments, width, label='Comments', color=color_comments)

# Final touches
ax.set_ylabel('Average Count')
ax.set_title('Average Likes, Shares, & Comments by Platform')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(title='Metric')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 
In [88]:
# Section 5 heading: how platform and content type relate.
print("\n5. PLATFORM AND CONTENT TYPE RELATIONSHIPS", "-" * 30, sep="\n")
5. PLATFORM AND CONTENT TYPE RELATIONSHIPS
------------------------------
In [90]:
# Contingency table: number of posts for each (platform, content type) pair.
platform_content_counts = pd.crosstab(index=df['Platform'], columns=df['Content_Type'])
print("\nPlatform vs Content Type counts:", platform_content_counts, sep="\n")
Platform vs Content Type counts:
Content_Type  live stream  post  reel  shorts  tweet  video
Platform                                                   
instagram             211   191   214     196    188    212
tiktok                207   203   210     208    218    214
twitter               206   221   202     166    211    198
youtube               231   238   215     217    219    204
In [92]:
# Visualize the platform x content-type post counts as an annotated heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(platform_content_counts, annot=True, cmap='YlGnBu', fmt='d')
plt.title('Platform vs Content Type Distribution')
plt.tight_layout()
# Save BEFORE show: in a notebook plt.show() finalises and releases the current
# figure, so a later plt.savefig() would write a new, empty canvas to disk.
plt.savefig('platform_content_heatmap.png')
plt.show()
plt.close()
No description has been provided for this image
In [ ]:
 
In [95]:
# Mean engagement rate for each (platform, content type) pair, platforms as
# rows and content types as columns.
platform_content_engagement = pd.pivot_table(
    df,
    values='Engagement_Rate',
    index='Platform',
    columns='Content_Type',
    aggfunc='mean',
)
print(
    "\nAverage engagement rate by platform and content type:",
    platform_content_engagement,
    sep="\n",
)
Average engagement rate by platform and content type:
Content_Type  live stream  post  reel  shorts  tweet  video
Platform                                                   
instagram           93.34 53.64 67.87   69.01  83.43  30.98
tiktok              30.95 39.66 32.56  168.91  44.70  66.83
twitter             32.75 43.98 45.90   54.16  46.73  57.82
youtube             86.83 60.79 48.23   26.74  37.62  39.72
In [97]:
# Visualize mean engagement rate per (platform, content type) as an annotated heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(platform_content_engagement, annot=True, cmap='YlOrRd', fmt='.2f')
plt.title('Average Engagement Rate by Platform and Content Type')
plt.tight_layout()
# Save BEFORE show: in a notebook plt.show() finalises and releases the current
# figure, so a later plt.savefig() would write a new, empty canvas to disk.
plt.savefig('platform_content_engagement_heatmap.png')
plt.show()
plt.close()
No description has been provided for this image
In [ ]:
 
In [100]:
# Section 6: the individual posts that rank highest on each engagement measure.
print("\n6. TOP PERFORMING POSTS", "-" * 30, sep="\n")

# Ten posts with the largest absolute interaction count.
print("\nTop 10 posts by total engagement:")
top_engagement = df.sort_values(by='Total_Engagement', ascending=False).iloc[:10]
print(top_engagement[['Post_ID', 'Platform', 'Content_Type', 'Hashtag', 'Total_Engagement', 'Views']])

# Ten posts with the largest interactions-per-view percentage.
print("\nTop 10 posts by engagement rate:")
top_rate = df.sort_values(by='Engagement_Rate', ascending=False).iloc[:10]
print(top_rate[['Post_ID', 'Platform', 'Content_Type', 'Hashtag', 'Engagement_Rate', 'Views']])
6. TOP PERFORMING POSTS
------------------------------

Top 10 posts by total engagement:
        Post_ID   Platform Content_Type    Hashtag  Total_Engagement    Views
2907  Post_2908  instagram  live stream  challenge            635139  4498056
4960  Post_4961  instagram        tweet       tech            630930  2964544
3619  Post_3620    youtube         reel      dance            630822  3371356
1871  Post_1872  instagram       shorts       tech            630436   301575
556    Post_557  instagram         post    fitness            627178   129205
2864  Post_2865  instagram         reel      viral            626538  1831119
4759  Post_4760    youtube        tweet      music            625382  2050005
2492  Post_2493    twitter         reel    fashion            624224  3802774
3016  Post_3017    twitter        tweet    fashion            622586  1072473
8        Post_9    twitter  live stream  education            622293  2169523

Top 10 posts by engagement rate:
        Post_ID   Platform Content_Type    Hashtag  Engagement_Rate  Views
4826  Post_4827     tiktok       shorts      viral         28174.17   1266
1540  Post_1541  instagram  live stream     comedy          9569.16   4323
3686  Post_3687    youtube  live stream       tech          8250.61   5467
2647  Post_2648    youtube         post  challenge          5237.75   8982
4691  Post_4692    twitter       shorts  education          4778.18   8162
1033  Post_1034  instagram        tweet  education          4499.51  11338
4991  Post_4992  instagram       shorts    fashion          4331.54  10157
4137  Post_4138  instagram        tweet     comedy          4081.21  13578
4284  Post_4285    youtube  live stream     gaming          3714.53   7810
4571  Post_4572     tiktok        video     gaming          3494.10   5679
In [102]:
# Final dataset summary
print("\nFinal dataset shape:", df.shape)
print("\nFinal dataset information:")
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()
# Justification: After all transformations, it's important to review the final
# dataset structure to ensure all changes were applied as expected.
Final dataset shape: (5000, 12)

Final dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Post_ID           5000 non-null   object 
 1   Platform          5000 non-null   object 
 2   Hashtag           5000 non-null   object 
 3   Content_Type      5000 non-null   object 
 4   Region            5000 non-null   object 
 5   Views             5000 non-null   int64  
 6   Likes             5000 non-null   int64  
 7   Shares            5000 non-null   int64  
 8   Comments          5000 non-null   int64  
 9   Engagement_Level  5000 non-null   object 
 10  Total_Engagement  5000 non-null   int64  
 11  Engagement_Rate   5000 non-null   float64
dtypes: float64(1), int64(5), object(6)
memory usage: 468.9+ KB
None
In [104]:
# Final quality check before saving: per-column count of nulls still present.
# Justification: a frame with no nulls is complete and ready for analysis.
print("\nRemaining missing values:", df.isna().sum(), sep="\n")
Remaining missing values:
Post_ID             0
Platform            0
Hashtag             0
Content_Type        0
Region              0
Views               0
Likes               0
Shares              0
Comments            0
Engagement_Level    0
Total_Engagement    0
Engagement_Rate     0
dtype: int64
In [106]:
# Persist the current `df` as the cleaned dataset so later work can start from it.
# index=False keeps the row index out of the file, which would otherwise show
# up as an extra unnamed column.
df.to_csv(path_or_buf='Viral_Social_Media_Trends_Cleaned.csv', index=False)
print("Cleaned dataset saved as 'Viral_Social_Media_Trends_Cleaned.csv'")
Cleaned dataset saved as 'Viral_Social_Media_Trends_Cleaned.csv'
In [114]:
import pandas as pd
import plotly.express as px

# Separate frame read straight from the source CSV (none of the cleaning done
# on `df` is applied to it); the plotly charts below are built from it.
df_trends = pd.read_csv(filepath_or_buffer='Viral_Social_Media_Trends.csv')

# Preview the first five rows.
df_trends.head(5)
Out[114]:
Post_ID Platform Hashtag Content_Type Region Views Likes Shares Comments Engagement_Level
0 Post_1 TikTok #Challenge Video UK 4163464 339431 53135 19346 High
1 Post_2 Instagram #Education Shorts India 4155940 215240 65860 27239 Medium
2 Post_3 Twitter #Challenge Video Brazil 3666211 327143 39423 36223 Medium
3 Post_4 YouTube #Education Shorts Australia 917951 127125 11687 36806 Low
4 Post_5 TikTok #Dance Post Brazil 64866 171361 69581 6376 Medium
In [116]:
# Mean views / likes / shares / comments for each (Region, Engagement_Level) pair.
df_geo = (
    df_trends
    .groupby(['Region', 'Engagement_Level'])[['Views', 'Likes', 'Shares', 'Comments']]
    .mean()
    .reset_index()
)

# Geo scatter with one animation frame per engagement level: marker colour is
# mean views, marker size is mean likes. The colour range is pinned to the
# overall min/max so colours stay comparable between frames.
# NOTE(review): locationmode='country names' needs Region values that plotly
# recognises as country names; abbreviations such as 'UK' may not resolve - confirm.
views_range = (df_geo['Views'].min(), df_geo['Views'].max())
fig_geo = px.scatter_geo(
    data_frame=df_geo,
    locations="Region",
    locationmode='country names',
    hover_name="Region",
    color="Views",
    size="Likes",
    animation_frame="Engagement_Level",
    color_continuous_scale="thermal",
    range_color=views_range,
    projection="natural earth",
    title='🌍 Viral Post Views by Region and Engagement Level',
    height=600,
)

fig_geo.update_layout(geo={'showframe': False, 'showcoastlines': True})
fig_geo.show()
In [118]:
# Mean views / likes / shares / comments for each (Platform, Engagement_Level) pair.
df_line = (
    df_trends
    .groupby(['Platform', 'Engagement_Level'])[['Views', 'Likes', 'Shares', 'Comments']]
    .mean()
    .reset_index()
)

# Mean views per platform, one animation frame per engagement level.
fig_line = px.line(
    data_frame=df_line,
    x='Platform',
    y='Views',
    color='Platform',
    markers=True,
    animation_frame='Engagement_Level',
    labels={'Views': 'Average Views'},
    title='📈 Animated Platform-Wise Views by Engagement Level',
    height=600,
)

fig_line.update_layout(transition=dict(duration=500))
fig_line.show()
In [120]:
# Horizontal bars of mean views per region (from df_geo), one animation frame
# per engagement level; regions on the y axis are ordered by their total.
fig_bar = px.bar(
    data_frame=df_geo,
    x='Views',
    y='Region',
    orientation='h',
    color='Region',
    animation_frame='Engagement_Level',
    labels={'Views': 'Average Views'},
    title='🏆 Top Regions by Average Views (Animated by Engagement Level)',
    height=600,
)

fig_bar.update_layout(
    transition=dict(duration=500),
    yaxis=dict(categoryorder='total ascending'),
)
fig_bar.show()
In [122]:
# Post-level scatter of likes against shares: bubble size is the comment
# count, colour is the platform, one animation frame per engagement level.
fig_bubble = px.scatter(
    data_frame=df_trends,
    x='Likes',
    y='Shares',
    color='Platform',
    size='Comments',
    size_max=60,
    hover_name='Post_ID',
    animation_frame='Engagement_Level',
    title='🔵 Likes vs Shares (Bubble Size = Comments)',
    height=600,
)

fig_bubble.update_layout(transition=dict(duration=500))
fig_bubble.show()
In [126]:
# Prepare data for Area Chart
# Total engagement counts interactions only (likes + shares + comments), the
# same definition used for df['Total_Engagement'] earlier in this notebook.
# Views are left out: they are roughly ten times larger than the interaction
# counts, so adding them would make this chart a restatement of views.
df_trends['Total_Engagement'] = df_trends['Likes'] + df_trends['Shares'] + df_trends['Comments']
# Mean total engagement for each (Platform, Engagement_Level) pair.
df_area = df_trends.groupby(['Platform', 'Engagement_Level']).agg({
    'Total_Engagement': 'mean'
}).reset_index()

# Animated Area Chart: one frame per engagement level
fig_area = px.area(df_area,
                   x='Platform',
                   y='Total_Engagement',
                   color='Platform',
                   animation_frame='Engagement_Level',
                   title='🏔️ Total Engagement Across Platforms (Animated)',
                   height=600)

fig_area.update_layout(transition={'duration': 500})
fig_area.show()
In [ ]: